Text Analytics

This notebook is for analyzing the Star Wars original trilogy scripts with various methods of text analysis.

## loading packages used in project.
## setting conflict preferences.

Sentiment Analysis

Load Star Wars data.

# Read the Episode IV script and shape it into a tidy tibble with one
# row per spoken line: document, line_number, character, dialogue.
ep4 <- fread("data/episode_iv.txt") %>%
        as_tibble() %>%
        rename(character = Character,
               dialogue = Dialogue) %>%
        mutate(document = "a new hope",
               line_number = row_number()) %>%
        select(document, line_number, character, dialogue)

# Quick preview of the tidied script. line_number was already assigned
# when ep4 was built, so the redundant row_number() recompute is dropped.
ep4 %>%
        select(line_number, character, dialogue)

We’ll tokenize this using tidytext. This enables us to display word counts, with and without stop words.

# Bar chart of the 25 most frequent tokens per film, stop words
# included, faceted by document.
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        group_by(document) %>%
        count(word, sort = TRUE) %>%
  # keep the 25 most frequent words within each document group
  slice_max(n, n=25) %>%
          mutate(word = reorder(word, n)) %>%
          ggplot(aes(n, word)) +
          geom_col() +
          labs(y = NULL)+
        theme_phil()+
        theme(axis.text.y=element_text(size=8))+
        facet_wrap(document ~.)

# Load tidytext's stop-word lexicon into the session.
data(stop_words)

# Same top-25 chart as above, but with stop words removed via anti_join.
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        group_by(document) %>%
        count(word, sort = TRUE) %>%
  slice_max(n, n=25) %>%
          mutate(word = reorder(word, n)) %>%
          ggplot(aes(n, word)) +
          geom_col() +
          labs(y = NULL)+
        theme_phil()+
        theme(axis.text.y=element_text(size=8))+
        facet_wrap(document ~.)

We can similarly do this for each main character.

# Total words spoken per character per film; only characters with more
# than 25 words are kept (used downstream to pick "main" characters).
# The original counted per word and then summed the counts across a
# second, redundant group_by -- summarize(n()) on the token stream is
# equivalent in one step.
word_counts<- ep4 %>%
        unnest_tokens(word, dialogue) %>%
        #anti_join(stop_words, by="word") %>%
        group_by(document, character) %>%
        summarize(words = n(),
                  .groups = "drop") %>%
        arrange(desc(words)) %>%
        filter(words > 25)

# Bar chart of words spoken per character (lowercased names, ordered by
# word count), faceted by film.
word_counts %>%
        ggplot(aes(x = words, y = reorder(tolower(character), words))) +
        geom_col() +
        facet_wrap(document ~ .) +
        labs(x = "words spoken", y = "character") +
        theme_phil()

# show how we get here
# Token-level running word count per character. 'AUNT BERU' is
# normalized to 'BERU' first. main_character buckets anyone outside the
# top-10 talkers (taken from word_counts) into "other"; `final` carries
# the character name only on each character's last token, so a line
# chart can label just the endpoints.
word_runningcount<-ep4 %>%
        unnest_tokens(word, dialogue) %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        group_by(document, character) %>%
        mutate(main_character = case_when(character %in% (word_counts %>%
                                                  slice_max(words, n=10) %>%
                                                  pull(character))
                                                  ~ tolower(character),
               TRUE ~ "other")) %>%
        mutate(count = 1,
               running_count = cumsum(count),
               final = case_when(running_count == max(running_count) ~ character,
                                 TRUE ~ NA_character_))

library(randomcoloR)
# Number of distinct speakers per document, used to size the palette.
# NOTE(review): with more than one document this pull() yields a vector,
# while distinctColorPalette(n) expects a single count -- fine here
# since only episode IV has been loaded so far; confirm before reuse.
n <- word_runningcount %>% group_by(document) %>% summarize(characters = n_distinct(character)) %>%
        pull(characters) 

# Seed the RNG so the random palette is reproducible across renders.
set.seed(1)
palette <- distinctColorPalette(n)

# line chart
# Cumulative words spoken over the course of the film, one line per
# character; `final` (non-NA only on each character's last token) drives
# the end-of-line labels via geom_label_repel.
word_runningcount %>%
        ungroup() %>%
        mutate(word_number = row_number()) %>%
        ggplot(., aes(x=line_number, 
                                      y=running_count, 
                                      label = final,
                                      color = character, 
                                      group=character,
                                      by = main_character))+
                geom_line(lwd=1.5)+
                theme_phil()+
                geom_label_repel(nudge_x =1,
                                 nudge_y = 0.25)+
                guides(label = "none",
                       color = "none")+
                facet_wrap(document ~.)+
                scale_color_manual(values = palette)+
                labs(y="Word Count",
                     x ="Movie Line")

# count by character
# Top-10 non-stop words for each of the 12 biggest talkers, one facet
# per character. reorder_within/scale_y_reordered keep the word ordering
# independent within each facet.
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words,
                  by = "word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        group_by(document, character) %>%
        count(word, sort=T) %>%
        filter(character %in% (word_counts %>% slice_max(words, n=12) %>% pull(character))) %>%
        group_by(document, character) %>%
        # rank rows within each character (counts are already sorted desc)
        mutate(rank = row_number()) %>%
        filter(rank<=10) %>%
        ungroup() %>%
        mutate(word = reorder_within(word, n, character)) %>%
        mutate(character = reorder(character, desc(n))) %>%
        ggplot(., aes(x=n, y=word, fill = character))+
        geom_col(show.legend=F)+
        scale_y_reordered()+
        facet_wrap(~character, ncol=3, scales="free_y")+
        theme_phil()+
        scale_fill_manual(values = distinctColorPalette(12))+
        theme(axis.text.y = element_text(size=8))

Sentiment Analysis

We can now use sentiment analysis to classify positive and negative words, which we can then count up using characters, etc. We’ll use the “bing” lexicon to start.

# Peek at a random sample of the bing lexicon.
get_sentiments("bing") %>%
        sample_n(10)
# get sentiment: tokenize, drop stop words, normalize 'AUNT BERU', then
# keep only tokens present in the bing lexicon. The join key is now
# explicit, which silences the "Joining, by" message and guards against
# accidental multi-column joins.
ep4_sentiment<-
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        inner_join(get_sentiments("bing"), by = "word")
# # plot line
# ep4_sentiment %>%
#         count(document, index = line_number %/% 1, sentiment) %>%
#         pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
#         mutate(sentiment = positive - negative) %>%
#         ggplot(., aes(x=index, y=sentiment, sentiment))+
#         geom_line()+
#         theme_phil()+
#         facet_wrap(document~.)

# plot bar
# Net bing sentiment (positive minus negative word counts) in 8-line
# windows across the film; squish clamps extremes into the fill scale.
ep4_sentiment %>%
        count(document, index = line_number %/% 8, sentiment) %>%
        pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
        mutate(sentiment = positive - negative) %>%
        ggplot(., aes(x=index, y=sentiment, fill=sentiment))+
        geom_col()+
        theme_phil()+
        facet_wrap(document~.)+
        scale_fill_gradient2_tableau(limits=c(-4, 2), oob = scales::squish)+
        theme(legend.title = element_text(size=8))+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))

What are the most positive and negative sequences?

# most negative lines by net bing score, joined back to the raw script
# to show the dialogue (explicit keys avoid the "Joining, by" message)
ep4_sentiment %>%
        count(document, line_number, index = line_number %/% 8, sentiment) %>%
        pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
        mutate(sentiment = positive - negative) %>%
        slice_min(sentiment, n=5) %>%
        inner_join(., ep4, by = c("document", "line_number")) %>%
        select(document, character, line_number, sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()
# most positive lines by net bing score, joined back to the raw script
# (explicit join keys, same fix as the negative table)
ep4_sentiment %>%
        count(document, line_number, index = line_number %/% 8, sentiment) %>%
        pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
        mutate(sentiment = positive - negative) %>%
        slice_max(sentiment, n=5) %>%
        inner_join(., ep4, by = c("document", "line_number")) %>%
        select(document, character, line_number, sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()

Hmm. The negative looks alright, but the positive is rather off. We’ll see how other approaches compare next, but let’s also check on characters.

Who are the most positive and negative characters?

# positive and negative characters: net bing sentiment per character.
# count() already tallies by the named variables, so the preceding
# redundant group_by() has been removed.
ep4_sentiment %>%
        count(document, character, sentiment) %>%
        pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
        mutate(sentiment = positive - negative) %>%
        arrange(sentiment) %>%
        ggplot(., aes(y=reorder(character, sentiment), x=sentiment, fill = sentiment))+
        geom_col()+
        theme_phil()+
        scale_fill_gradient2_tableau(limits=c(-5, 2), oob = scales::squish)+
        theme(legend.title = element_text(size=8))+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))+
        ylab("character")+
        xlab("sentiment score")

Well, Luke is rather whiny, so…

Let’s try AFINN to see how its lexicon compares.

# Fix the RNG seed so the sampled AFINN rows are reproducible.
set.seed(10)
get_sentiments("afinn") %>%
        sample_n(10)

Repeat the analysis before.

library(textdata)

# AFINN assigns each word an integer value (-5..5); aggregate net value
# in 8-line windows. Fixes: explicit join key, and each word's value is
# weighted by its count n -- count() collapses repeated words into one
# row, so the previous sum(value) undercounted repeats.
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        inner_join(get_sentiments("afinn"), by = "word") %>%
        count(document, line_number, index = line_number %/% 8, value) %>%
        group_by(document, index) %>%
        summarize(sentiment = sum(value * n), .groups = "drop") %>%
        ggplot(., aes(x=index, y=sentiment, fill=sentiment))+
        geom_col()+
        theme_phil()+
        facet_wrap(document~.) +
        scale_fill_gradient2_tableau(limits=c(-8, 8), oob = scales::squish)+
        theme(legend.title = element_text(size=8))+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))

# check most positive as before; each word's value is weighted by its
# count n (count() collapses repeats), and join keys are explicit
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        inner_join(get_sentiments("afinn"), by = "word") %>%
        count(document, line_number, index = line_number %/% 8, value) %>%
        group_by(document, line_number) %>%
        summarize(sentiment = sum(value * n)) %>%
        slice_max(sentiment, n=5, with_ties = F) %>%
        inner_join(., ep4, by = c("document", "line_number")) %>%
        select(document, character, line_number, sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()
# check most negative; same weighting and explicit-key fixes as the
# most-positive table above
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        inner_join(get_sentiments("afinn"), by = "word") %>%
        count(document, line_number, index = line_number %/% 8, value) %>%
        group_by(document, line_number) %>%
        summarize(sentiment = sum(value * n)) %>%
        slice_min(sentiment, n=5, with_ties = F) %>%
        inner_join(., ep4, by = c("document", "line_number")) %>%
        select(document, character, line_number, sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()
# and characters: total AFINN sentiment per character, weighting each
# word's value by its count n and joining on an explicit key
ep4 %>%
        unnest_tokens(word, dialogue) %>%
        anti_join(stop_words, by="word") %>%
        mutate(character = case_when(character == 'AUNT BERU' ~ 'BERU',
                                     TRUE ~ character)) %>%
        inner_join(get_sentiments("afinn"), by = "word") %>%
        count(document, character, line_number, index = line_number %/% 8, value) %>%
        group_by(document, character) %>%
        summarize(sentiment = sum(value * n), .groups = "drop") %>%
        ggplot(., aes(y=reorder(character, sentiment), x=sentiment, fill = sentiment))+
        geom_col()+
        theme_phil()+
        scale_fill_gradient2_tableau(limits=c(-5, 2), oob = scales::squish)+
        theme(legend.title = element_text(size=8))+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))+
        ylab("character")+
        xlab("sentiment score")+
        facet_wrap(document~.)

Algorithmic Sentiment

We’re encountering some of the limits of these lexicon based sentiment approaches, in that they struggle to get the context of a word within a sentence.

We can use sentimentr to illustrate the algorithmic approach.

library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.1.1
library(magrittr)

This breaks text down into sentences and then assesses sentiment for each sentence. For example, we can look at Princess Leia’s recorded speech.

# Leia's full recorded message (script line 270), printed as raw text.
ep4 %>%
        filter(line_number == 270) %>%
        pull(dialogue)
## [1] "General Kenobi, years ago you served my father in the Clone Wars.  Now he begs you to help him in his struggle against the Empire.  I regret that I am unable to present my father's request to you in person, but my ship has fallen under attack and I'm afraid my mission to bring you to Alderaan has failed.  I have placed information vital to the survival of the Rebellion into the memory systems of this R2 unit.  My father will know how to retrieve it.  You must see this droid safely delivered to him on Alderaan.  This is our most desperate hour.  Help me, Obi-Wan Kenobi, you're my only hope."
# Sentence-level sentimentr scores for the same speech, averaged per
# (character, dialogue) group and rendered as a table.
ep4 %>%
        filter(line_number == 270) %>%
        get_sentences() %>%
        sentiment_by(by = c('character', 'dialogue')) %>%
        flextable() %>%
        flextable::autofit()
# Highlight the speech sentence-by-sentence in a browser; %$% (magrittr
# exposition pipe) exposes the columns to sentiment_by directly.
# Side effect: writes and opens a temporary polarity.html file.
ep4 %>%
        filter(line_number == 270) %>%
        mutate(sentences = get_sentences(dialogue)) %$%
        sentiment_by(sentences, list(document, character, line_number)) %>%
        sentimentr::highlight()
## Saved in C:\Users\peh\AppData\Local\Temp\Rtmp0wBUYa/polarity.html
## Opening C:\Users\peh\AppData\Local\Temp\Rtmp0wBUYa/polarity.html ...
# Same sentence-level highlight for script line 803.
# Side effect: writes and opens a temporary polarity.html file.
ep4 %>%
        filter(line_number == 803) %>%
        mutate(sentences = get_sentences(dialogue)) %$%
        sentiment_by(sentences, list(document, character, line_number)) %>%
        sentimentr::highlight()
## Saved in C:\Users\peh\AppData\Local\Temp\Rtmp0wBUYa/polarity.html
## Opening C:\Users\peh\AppData\Local\Temp\Rtmp0wBUYa/polarity.html ...

We can apply this to the entirety of the movie…

# Sentence-level sentimentr scores for the whole film, averaged per
# (character, line_number); reused by the plots below.
sentimentr_scores<-
ep4 %>%
        get_sentences() %>%
        sentiment_by(by = c('character', 'line_number'))

# plot by line number
# Average sentiment per script line across the film; squish clamps
# extreme values into the fill scale limits.
sentimentr_scores %>%
        arrange(line_number) %>%
        mutate(index = line_number %/% 8) %>%
        ggplot(., aes(x=line_number, y=ave_sentiment, fill = ave_sentiment))+
        geom_col()+
        theme_phil()+
        scale_fill_gradient2_tableau(limits=c(-1, 1), oob = scales::squish)+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))

# summarize it a bit
# Same scores aggregated into 8-line windows (sum of per-line averages).
sentimentr_scores %>%
        arrange(line_number) %>%
        mutate(index = line_number %/% 8) %>%
        group_by(index) %>%
        summarize(ave_sentiment = sum(ave_sentiment)) %>%
        ggplot(., aes(x=index, y=ave_sentiment, fill = ave_sentiment))+
        geom_col()+
        theme_phil()+
        scale_fill_gradient2_tableau(limits=c(-3, 3), oob = scales::squish)+
        guides(fill = guide_colourbar(title = "sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))

Grab distribution of sentiment for characters.

# Distribution of per-line sentiment for characters with more than 100
# words; zero-sentiment lines are dropped so the boxplots reflect only
# lines sentimentr actually scored.
sentimentr_scores %>%
        group_by(character) %>%
        mutate(total_word_count = sum(word_count)) %>%
        filter(total_word_count > 100) %>%
        filter(ave_sentiment !=0) %>%
        ggplot(., aes(y=reorder(character, total_word_count), 
                     color = ave_sentiment,
                      x=ave_sentiment))+
        geom_boxplot(alpha=0.1)+
        geom_jitter(height=0.1, width=0)+
        theme_phil()+
        scale_color_gradient2_tableau(limits=c(-0.75, 0.75), oob = scales::squish)+
        #theme(legend.title = element_text(size=8))+
        guides(color = guide_colourbar(title = "average sentiment",
                                      title.position = "top",
                                      barwidth=8,
                                      barheight=0.5))+
        ylab("character")+
        xlab("average sentiment")+
  theme(panel.grid.major = element_blank())+
  geom_vline(xintercept = 0)

Well we gotta explore this. Does Luke get less whiny throughout the movie?

# cumulative sentiment
# Running total of Luke's per-line sentiment; lines with |sentiment|
# above 0.35 are labeled with their dialogue.
# NOTE(review): filter(row_number > 65) skips Luke's first 65 scored
# lines -- presumably to focus on the latter part of the film; confirm.
ep4 %>%
        filter(character== 'LUKE') %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        arrange(line_number) %>%
        mutate(row_number = row_number()) %>%
        filter(row_number > 65) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .35 ~ dialogue,
                                         TRUE ~ "")) %>%
        mutate(run_sentiment = cumsum(ave_sentiment)) %>%
        ggplot(., aes(x=row_number, y=run_sentiment, label = show_negative))+
        geom_line()+
        geom_label_repel(size=2.5, max.overlaps=50)+
        theme_phil()+
        facet_wrap(document+character~.)+
        xlab("line_number")+
        ylab("running total of sentiment")
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# average sentiment
# Per-line sentiment for Luke (lines with more than two words and a
# nonzero score), with a loess trend to show drift across the film.
ep4 %>%
        filter(character== 'LUKE') %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        arrange(line_number) %>%
        mutate(row_number = row_number()) %>%
       #filter(row_number > 65) %>%
        filter(word_count > 2) %>%
        filter(ave_sentiment != 0) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .3 ~ dialogue,
                                         TRUE ~ "")) %>%
        mutate(run_sentiment = cumsum(ave_sentiment)) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point()+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=2.5, max.overlaps=50)+
        theme_phil()+
        facet_wrap(document+character~.)+
        ylab("average sentiment")+
        xlab("line_number")+
        geom_smooth(method = 'loess', formula = 'y ~ x')

Alright this is too much fun. Let’s look at some other characters.

# average sentiment
# Same per-line sentiment view as Luke's, for Han.
ep4 %>%
        filter(character== 'HAN') %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        arrange(line_number) %>%
        filter(word_count > 2) %>%
        mutate(row_number = row_number()) %>%
       #filter(row_number > 65) %>%
        filter(ave_sentiment != 0) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .3 ~ dialogue,
                                         TRUE ~ "")) %>%
        mutate(run_sentiment = cumsum(ave_sentiment)) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point()+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=2, max.overlaps=50)+
        theme_phil()+
        facet_wrap(document+character~., ncol=1)+
        ylab("average sentiment")+
        xlab("line_number")+
        geom_smooth(method = 'loess', formula = 'y ~ x')

# average sentiment
# Same per-line sentiment view, for Threepio.
ep4 %>%
        filter(character== 'THREEPIO') %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        arrange(line_number) %>%
        filter(word_count > 2) %>%
        mutate(row_number = row_number()) %>%
       #filter(row_number > 65) %>%
       filter(ave_sentiment != 0) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .3 ~ dialogue,
                                         TRUE ~ "")) %>%
        mutate(run_sentiment = cumsum(ave_sentiment)) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point()+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=2.5, max.overlaps=50)+
        theme_phil()+
        facet_wrap(document+character~., ncol=1)+
        ylab("average sentiment")+
        xlab("line_number")+
        geom_smooth(method = 'loess', formula = 'y ~ x')

# most positive lines by summed sentimentr score. slice_max already
# returns rows in descending order, so the redundant arrange() calls are
# gone; the join key is explicit.
ep4 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        slice_max(sum_sentiment, n=5) %>%
        inner_join(., ep4, by = "line_number") %>%
        mutate_if(is.numeric, round, 3) %>%
        select(document, character, line_number, sum_sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()
# most negative lines by summed sentimentr score (slice_min orders its
# output itself; redundant arrange() removed, join key explicit)
ep4 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        slice_min(sum_sentiment, n=5) %>%
        inner_join(., ep4, by = "line_number") %>%
        mutate_if(is.numeric, round, 3) %>%
        select(document, character, line_number, sum_sentiment, dialogue) %>%
        flextable() %>%
        flextable::autofit()

Let’s plot the cumulative sentiment for each movie and see how they compare.

# by point
# Running (cumulative) sentiment across the film, with high-magnitude
# lines labeled. Fixes: explicit join key; the as.character(dialogue)
# mutate was a no-op (summarize drops non-grouping columns); the first
# facet_wrap(document+character~.) was silently overridden by the later
# facet_wrap(document~.), so only the effective one is kept.
ep4 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(word_count > 2) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        inner_join(., ep4, by = "line_number") %>%
        mutate(running_sentiment = cumsum(sum_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(sum_sentiment) > .6 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=running_sentiment, label = show_negative))+
        geom_point(size=1)+
        geom_line(linetype = 'dotted', lwd=0.9)+
        geom_label_repel(size=3, max.overlaps=30)+
        theme_phil()+
        ylab("running total of sentiment")+
        xlab("line_number")+
        facet_wrap(document~.)

### Emotions

Get all emotions

# find some sentences
# Sentence-level emotion scores, sorted so the most emotive sentences
# appear first.
ep4 %>%
        get_sentences() %>%
        emotion() %>%
        arrange(desc(emotion))
# bar chart - percent of total: share of each (combined) emotion within
# 8-line windows. The chain of `==` filters is replaced with the %in%
# idiom; behavior is unchanged (the same seven emotion types pass).
ep4 %>%
        get_sentences() %>%
        emotion() %>%
        mutate(emotion_type = as.character(emotion_type)) %>%
        filter(emotion_type %in% c('anger', 'disgust', 'sadness', 'fear',
                                   'trust', 'anticipation', 'joy')) %>%
        # collapse correlated pairs to reduce the number of series
        mutate(emotion_type = case_when(emotion_type %in% c('anger', 'fear') ~ 'anger & fear',
                                        emotion_type %in% c('trust', 'joy') ~ 'trust & joy',
                                        TRUE ~ emotion_type)) %>%
        mutate(index = line_number %/% 8) %>%
        group_by(index, emotion_type) %>%
        summarize(sum_emotion = sum(emotion), .groups = "drop") %>%
        group_by(index) %>%
        arrange(sum_emotion) %>%
        ggplot(., aes(x=index, y=sum_emotion, fill=emotion_type, color = emotion_type, group=emotion_type))+
        geom_col(position = 'fill') +
        theme_phil()+
        scale_fill_colorblind()+
        scale_color_colorblind()

# area chart: cumulative emotion per combined type across 8-line
# windows (%in% idiom replaces the chained `==` filter; same types pass)
ep4 %>%
        get_sentences() %>%
        emotion() %>%
        mutate(emotion_type = as.character(emotion_type)) %>%
        filter(emotion_type %in% c('anger', 'disgust', 'sadness', 'fear',
                                   'trust', 'anticipation', 'joy')) %>%
        mutate(emotion_type = case_when(emotion_type %in% c('anger', 'fear') ~ 'anger & fear',
                                        emotion_type %in% c('trust', 'joy') ~ 'trust & joy',
                                        TRUE ~ emotion_type)) %>%
        mutate(index = line_number %/% 8) %>%
        group_by(index, emotion_type) %>%
        summarize(sum_emotion = sum(emotion), .groups = "drop") %>%
        # running total within each emotion type (rows are ordered by
        # index after summarize, so cumsum accumulates chronologically)
        group_by(emotion_type) %>%
        mutate(running_sum = cumsum(sum_emotion)) %>%
        ggplot(., aes(x=index, y=running_sum, fill=emotion_type, color = emotion_type, group=emotion_type))+
        geom_area()+
        theme_phil()+
        scale_fill_colorblind()+
        scale_color_colorblind()

# line chart of emotion: same cumulative series as the area chart,
# drawn as lines (%in% idiom replaces the chained `==` filter)
ep4 %>%
        get_sentences() %>%
        emotion() %>%
        mutate(emotion_type = as.character(emotion_type)) %>%
        filter(emotion_type %in% c('anger', 'disgust', 'sadness', 'fear',
                                   'trust', 'anticipation', 'joy')) %>%
        mutate(emotion_type = case_when(emotion_type %in% c('anger', 'fear') ~ 'anger & fear',
                                        emotion_type %in% c('trust', 'joy') ~ 'trust & joy',
                                        TRUE ~ emotion_type)) %>%
        mutate(index = line_number %/% 8) %>%
        group_by(index, emotion_type) %>%
        summarize(sum_emotion = sum(emotion), .groups = "drop") %>%
        group_by(emotion_type) %>%
        mutate(running_sum = cumsum(sum_emotion)) %>%
        ggplot(., aes(x=index, y=running_sum, fill=emotion_type, color = emotion_type, group=emotion_type))+
        geom_line(lwd=1.1)+
        theme_phil()+
        scale_fill_colorblind()+
        scale_color_colorblind()

## Episode V and VI

# Load the Episode V script in the same tidy shape as ep4.
# NOTE(review): unlike the ep4 load, no rename() is applied -- presumably
# this file already has lowercase 'character'/'dialogue' columns; verify
# against the raw data.
ep5<-fread("data/episode_v.txt") %>%
        mutate(document = "the empire strikes back") %>%
        mutate(line_number = row_number()) %>%
        select(document, line_number, character, dialogue) %>%
        as_tibble() 


# Load the Episode VI script (fread reports and heals one improperly
# quoted line -- see the warning below), then stack all three films.
ep6<-fread("data/episode_vi.txt") %>%
        mutate(document = "return of the jedi") %>%
        mutate(line_number = row_number()) %>%
        select(document, line_number, character, dialogue) %>%
        as_tibble()
## Warning in fread("data/episode_vi.txt"): Found and resolved improper quoting
## out-of-sample. First healed line 191: <<"190" "LEIA" "He means \"You're welcome.
## \"">>. If the fields are not quoted (e.g. field separator does not appear within
## any field), try quote="" to avoid this warning.
# Combined trilogy: one row per spoken line across all three documents.
starwars <- bind_rows(ep4, ep5, ep6)
# Running sentiment for episode IV (compact styling for side-by-side
# comparison with episodes V and VI). Fixes mirror the earlier chunk:
# explicit join key, no-op as.character mutate removed, and the
# overridden duplicate facet_wrap dropped.
ep4 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(word_count > 2) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        inner_join(., ep4, by = "line_number") %>%
        mutate(running_sentiment = cumsum(sum_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(sum_sentiment) > .6 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=running_sentiment, label = show_negative))+
        geom_point(size=0.5)+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=1.5, max.overlaps=45)+
        theme_phil()+
        ylab("running total of sentiment")+
        xlab("line_number")+
        facet_wrap(document~.)

# Running sentiment for episode V (explicit join key; no-op mutate and
# overridden duplicate facet_wrap removed, as in the episode IV chunk).
ep5 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(word_count > 2) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        inner_join(., ep5, by = "line_number") %>%
        mutate(running_sentiment = cumsum(sum_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(sum_sentiment) > .6 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=running_sentiment, label = show_negative))+
        geom_point(size=0.5)+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=1.5, max.overlaps=45)+
        theme_phil()+
        ylab("running total of sentiment")+
        xlab("line_number")+
        facet_wrap(document~.)

# Running sentiment for episode VI (explicit join key; no-op mutate and
# overridden duplicate facet_wrap removed, as in the episode IV chunk).
ep6 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(word_count > 2) %>%
        group_by(line_number) %>%
        summarize(sum_sentiment = sum(ave_sentiment)) %>%
        inner_join(., ep6, by = "line_number") %>%
        mutate(running_sentiment = cumsum(sum_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(sum_sentiment) > .6 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=running_sentiment, label = show_negative))+
        geom_point(size=0.5)+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=1.5, max.overlaps=45)+
        theme_phil()+
        ylab("running total of sentiment")+
        xlab("line_number")+
        facet_wrap(document~.)

Palpy, just for John.

# without filtering word count
# Per-line sentiment for the Emperor with a loess trend. The earlier
# facet_wrap(document+character~.) was silently overridden by the later
# facet_wrap(character+document~.), so only the effective call remains.
ep6 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(character == 'EMPEROR') %>%
        filter(ave_sentiment != 0) %>%
       #filter(word_count>2) %>%
        arrange(line_number) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(running_sentiment = cumsum(ave_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .3 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point(size=1)+
        geom_line(linetype = 'dotted', lwd=0.9)+
        geom_label_repel(size=3, max.overlaps=45)+
        theme_phil()+
        ylab("average sentiment")+
        xlab("line_number")+
        facet_wrap(character+document~.)+
        geom_smooth(method = 'loess', formula = 'y ~ x')

# with the filter
# Palpatine again, restricted to lines of more than two words.
ep6 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(character == 'EMPEROR') %>%
        filter(ave_sentiment != 0) %>%
        filter(word_count>2) %>%
        arrange(line_number) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(running_sentiment = cumsum(ave_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        # lower label threshold than the unfiltered plot, since fewer lines remain
        mutate(show_negative = case_when(abs(ave_sentiment) > .15 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point(size=0.9)+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=1.5, max.overlaps=45)+
        theme_phil()+
        ylab("average sentiment")+
        xlab("line_number")+
        # single facet spec: facet_wrap() was called twice and only the
        # second call took effect
        facet_wrap(character+document~.)+
        geom_smooth(method = 'loess', formula = 'y ~ x')

Ackbar

# Admiral Ackbar's per-line average sentiment (no word-count filter).
ep6 %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(character == 'ACKBAR') %>%
        filter(ave_sentiment != 0) %>%
  #     filter(word_count>2) %>%
        arrange(line_number) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        mutate(running_sentiment = cumsum(ave_sentiment)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .1 ~ dialogue,
                                         TRUE ~ "")) %>%
        ggplot(., aes(x=row_number, y=ave_sentiment, label = show_negative))+
        geom_point(size=0.5)+
        geom_line(linetype = 'dotted', lwd=0.5)+
        geom_label_repel(size=2.5, max.overlaps=45)+
        theme_phil()+
        ylab("average sentiment")+
        xlab("line_number")+
        # single facet spec: facet_wrap() was called twice and only the
        # second call took effect
        facet_wrap(character+document~.)+
        geom_smooth(method = 'loess', formula = 'y ~ x')

John's request: plot the big wigs.

# The trilogy's main characters ("CREATURE" in Empire is Yoda; recoded below).
big_wigs <- c('LUKE', 'LEIA', 'HAN', 'BEN', 'VADER', 'YODA', 'THREEPIO', 'LANDO', 'EMPEROR')

# Fixed seed so distinctColorPalette() returns a reproducible palette.
set.seed(30)
# `<-` rather than `=` for top-level assignment; NOTE(review): this name
# shadows grDevices::palette() for the rest of the session.
palette <- distinctColorPalette(length(big_wigs))

# Running sentiment totals for the main characters, one facet per film.
starwars %>%
        # in the Empire script Yoda's early lines are attributed to 'CREATURE'
        mutate(character = case_when(document == 'the empire strikes back' & character == 'CREATURE' ~ 'YODA',
                                    TRUE ~ character)) %>%
        filter(character %in% big_wigs) %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        filter(ave_sentiment != 0) %>%
  #     filter(word_count>2) %>%
        arrange(line_number) %>%
        mutate(dialogue = as.character(dialogue)) %>%
        # order facets by release order rather than alphabetically
        mutate(document = factor(document, levels = c('a new hope', 'the empire strikes back', 'return of the jedi'))) %>%
        group_by(document, character) %>%
        mutate(running_sentiment = cumsum(ave_sentiment)) %>%
        # NOTE(review): running_average is computed but never plotted below
  mutate(running_average = rollapply(ave_sentiment, 2, mean, align='right', fill=NA)) %>%
        mutate(row_number = row_number()) %>%
        mutate(show_negative = case_when(abs(ave_sentiment) > .5 ~ dialogue,
                                         TRUE ~ "")) %>%
        # label each character's line once, at its final point
        mutate(max = case_when(row_number == max(row_number) ~ character,
               TRUE ~ NA_character_)) %>%
        ungroup() %>%
        mutate(row_number = row_number()) %>%
        ggplot(., aes(x=row_number, y=running_sentiment, group=character, color = character, label = max))+
     #   geom_point(size=0.25)+
        geom_line(lwd=1.1)+
        geom_label_repel(nudge_x = 1.5,
                                 nudge_y = 1)+
      #  geom_label_repel(size=1.5, max.overlaps=150)+
        theme_phil()+
        facet_wrap(document~., ncol=1)+
        ylab("running total of sentiment")+
        xlab("line_number")+
        scale_color_manual(values = palette)+
        # "none" replaces the deprecated guides(color = F)
        guides(color = "none")+
        theme(panel.grid.major = element_line(size = 0.25))+
  #      theme(panel.grid.major=element_blank())+
        geom_hline(yintercept = 0, linetype = 'dotted', col="grey60")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: Removed 1825 rows containing missing values (geom_label_repel).

# moving average
# rollify() (tibbletime) lifts mean() into a rolling 3-observation window
# function; defined here but only used in the commented-out chunk below.
 rolling_mean <- rollify(mean, window = 3)


# starwars %>%
#         mutate(character = case_when(document == 'the empire strikes back' & character == 'CREATURE' ~ 'YODA',
#                                     TRUE ~ character)) %>%
#         filter(character %in% big_wigs) %>%
#         get_sentences() %>%
#         sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
#         filter(ave_sentiment != 0) %>%
#   #     filter(word_count>2) %>%
#         arrange(line_number) %>%
#         mutate(dialogue = as.character(dialogue)) %>%
#         mutate(document = factor(document, levels = c('a new hope', 'the empire strikes back', 'return of the jedi'))) %>%
#         group_by(document, character) %>%
#         mutate(running_sentiment = cumsum(ave_sentiment)) %>%
#   mutate(running_average = rolling_mean(ave_sentiment)) %>%
#         mutate(row_number = row_number()) %>%
#         mutate(show_negative = case_when(abs(ave_sentiment) > .5 ~ dialogue,
#                                          TRUE ~ "")) %>%
#         mutate(max = case_when(row_number == max(row_number) ~ character,
#                TRUE ~ NA_character_)) %>%
#         ungroup() %>%
#         mutate(row_number = row_number()) %>%
#         ggplot(., aes(x=line_number, y=running_average, group=character, color = character, label = max))+
#      #   geom_point(size=0.25)+
#         geom_line(lwd=1.1)+
#         geom_label_repel(nudge_x = 1.5,
#                                  nudge_y = 1)+
#       #  geom_label_repel(size=1.5, max.overlaps=150)+
#         theme_phil()+
#         facet_wrap(document~., ncol=1)+
#         ylab("moving average of sentiment")+
#         xlab("line_number")+
#         scale_color_manual(values = palette)+
#         guides(color = F)+
#         theme(panel.grid.major = element_line(size = 0.25))+
#   #      theme(panel.grid.major=element_blank())+
#         geom_hline(yintercept = 0, linetype = 'dotted', col="grey60")
# 

If you just grab the average/sum sentiment, who comes out on top?

# Average sentiment per character per film (characters with > 5 scored lines).
starwars %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        # release-order facet levels, matching the summed-sentiment plot below
        mutate(document = factor(document, levels = c("a new hope", "the empire strikes back", "return of the jedi"))) %>%
        group_by(document, character) %>%
        filter(ave_sentiment !=0) %>%
        summarize(sentences = n_distinct(line_number),
                  avg_sentiment = mean(ave_sentiment),
                  sum_sentiment = sum(ave_sentiment),
                  .groups = "drop") %>%   # silences the grouped-output message
        filter(sentences > 5) %>%
    #    arrange(desc(sentences))
        ggplot(., aes(x=avg_sentiment,
                      fill=avg_sentiment,
                      y=reorder_within(character, avg_sentiment, document)))+
        geom_col()+
        scale_y_reordered()+
        facet_wrap(~document, ncol =1, scales="free_y")+
        theme_phil()+
        scale_fill_gradient2_tableau()+
        guides(fill="none")+
        theme(axis.text.y = element_text(size=8))+
        ylab("")+
        xlab("Average Sentiment")
## `summarise()` has grouped output by 'document'. You can override using the `.groups` argument.

# Summed sentiment per character per film (characters with > 5 scored lines).
starwars %>%
        get_sentences() %>%
        sentiment_by(by = c('document', 'character', 'line_number', 'dialogue')) %>%
        # set release-order facet levels before grouping so we never mutate
        # a grouping variable
        mutate(document = factor(document, levels = c("a new hope", "the empire strikes back", "return of the jedi"))) %>%
        group_by(document, character) %>%
        filter(ave_sentiment !=0) %>%
        summarize(sentences = n_distinct(line_number),
                  avg_sentiment = mean(ave_sentiment),
                  sum_sentiment = sum(ave_sentiment),
                  .groups = "drop") %>%   # silences the grouped-output message
        filter(sentences > 5) %>%
    #    arrange(desc(sentences))
        ggplot(., aes(x=sum_sentiment,
                      fill=sum_sentiment,
                      y=reorder_within(character, sum_sentiment, document)))+
        geom_col()+
        scale_y_reordered()+
        facet_wrap(~document, ncol =1, scales="free_y")+
        theme_phil()+
     #  scale_fill_gradient2_tableau()+
        # "none" replaces the deprecated guides(fill = F)
        guides(fill="none")+
        theme(axis.text.y = element_text(size=8))+
        ylab("")+
        xlab("Summed Sentiment")
## `summarise()` has grouped output by 'document'. You can override using the `.groups` argument.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

TF-IDF

Let’s compute the tf-idf by movie.

# tf-idf of every word by film. The original tokenized and counted the
# dialogue twice and joined the result back onto itself; add_count() derives
# the per-document token totals from the counts already in hand.
starwars_tf_idf<-starwars %>%
  unnest_tokens(word, dialogue) %>%
  count(document, word, sort=TRUE) %>%
  add_count(document, wt = n, name = "total") %>%   # per-film token total
  bind_tf_idf(word, document, n) %>%
  arrange(desc(tf_idf))
starwars_tf_idf
# Top-20 most distinctive words per film by tf-idf.
starwars_tf_idf %>%
  # release order for the facets
  mutate(document = factor(document, levels = c('a new hope', 'the empire strikes back', 'return of the jedi'))) %>%
  group_by(document) %>%
  # TRUE/FALSE spelled out (T/F are reassignable); exactly 20 bars per facet
  slice_max(tf_idf, n=20, with_ties = FALSE) %>%
  ungroup() %>%
  ggplot(., aes(tf_idf, reorder_within(word, tf_idf, document), fill = document)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~document, ncol=2, scales="free")+
  theme_phil()+
  labs(x="tf_idf", y="")+
  theme(axis.text.y = element_text(size=8))+
  scale_y_reordered()+
  scale_fill_colorblind()

N-Grams and Correlations

# Bigrams from all dialogue. Lines too short to form a bigram yield NA, and
# `!NA %in% stop_words$word` evaluates TRUE, so NAs would survive the stop-word
# filters below; drop them at the source instead of downstream.
starwars_bigrams<-starwars %>%
  unnest_tokens(bigram, dialogue, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

bigrams_separated<-starwars_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# keep only bigrams where neither word is a stop word
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

# Inspect the non-NA bigram counts.
bigram_counts %>%
  filter(!is.na(word1))

library(igraph)

# Directed word graph from bigrams that occur more than twice.
bigram_graph <- bigram_counts %>%
  filter(!is.na(word1), n > 2) %>%
  graph_from_data_frame()

library(ggraph)
## Warning: package 'ggraph' was built under R version 4.1.1
# Reproducible force-directed ("fr") layout for the bigram network.
set.seed(32)

# Closed arrowheads mark bigram direction (word1 -> word2).
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n),
                 show.legend = FALSE,
                 arrow = a,
                 end_cap = circle(.07, 'inches')) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

Topic Modeling

Let’s apply topic modeling to Star Wars. This requires converting the data to a document-term matrix.

library(tm)
## Warning: package 'tm' was built under R version 4.1.1
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:fabletools':
## 
##     features
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.1.1
# Cast tokenized, stop-word-free dialogue into a document-term matrix,
# treating each script line ("<film>_<line_number>") as a document.
starwars_dtm<-starwars %>%
  unnest_tokens(word, dialogue) %>%
  anti_join(stop_words, by = "word") %>%   # explicit key silences the join message
  count(document, line_number, word, sort=TRUE) %>%
  mutate(document_line_number = paste(document, line_number, sep="_")) %>%
  cast_dtm(document_line_number, word, n) 
starwars_dtm %>%
  tidy() %>%
  head()
# Fit a 4-topic LDA model; fixed seed for reproducibility.
starwars_lda <- LDA(starwars_dtm, k = 4, control = list(seed = 1234))
starwars_lda
## A LDA_VEM topic model with 4 topics.

Now opening up the LDA

# Top 25 terms for each topic by per-topic word probability (beta).
top_terms <- tidy(starwars_lda, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = 25) %>%
  ungroup()

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered() +
  theme_phil()

Document (line number) probabilities by topic.

# The four per-topic tables below were copy-pasted with only the sort column
# changing; factor the shared pipeline into one helper.
#
# top_dialogue(): flextable of the n_lines script lines with the highest
# gamma (per-document topic probability) for the given topic column.
#   topic_col — string name of the gamma column, e.g. "topic1"
#   n_lines   — number of lines to show (default 10, as in the original)
top_dialogue <- function(topic_col, n_lines = 10) {
  tidy(starwars_lda, matrix = "gamma") %>%
    mutate(topic = paste0("topic", topic)) %>%
    pivot_wider(names_from = topic, values_from = gamma) %>%
    arrange(document) %>%
    # recover character/dialogue; the key is "<film>_<line_number>"
    left_join(., starwars %>%
                mutate(document = paste(document, line_number, sep = "_")),
              by = "document") %>%
    select(document, starts_with('topic'), character, dialogue) %>%
    # .data pronoun sorts by the column named in topic_col
    arrange(desc(.data[[topic_col]])) %>%
    mutate_if(is.numeric, round, 3) %>%
    head(n_lines) %>%
    flextable() %>%
    flextable::autofit()
}

# topic 1
top_dialogue("topic1")
# topic 2
top_dialogue("topic2")
# topic 3
top_dialogue("topic3")
# topic 4
top_dialogue("topic4")